********************************************************************************
* Last update: 6/30/24
* This program reads in and cleans the raw Form 5500 data
********************************************************************************
set more off

global data_dir ".../data/F5500/raw files"

*-------------------------------------------------------------------------------
* Read in 5500 main files
*-------------------------------------------------------------------------------
//1999-2001

forvalues y = 1999(1)2001{

	insheet using "$data_dir/F_5500_`y'/f_5500_`y'.csv", clear

	*drop plans without EIN's 
	drop if spons_dfe_ein==0
	*drop non-pension plans
	keep if type_pension_bnft_code!=""

	*keep identity, participation, and filing information
	keep filing_id spons_dfe_ein spons_dfe_pn form_plan_year_begin_date ///
	type_plan_entity_ind type_dfe_plan_entity type_plan_filing_ind plan_eff_date ///
	collective_bargain_ind business_code tot_partcp_boy_cnt type_pension_bnft_code ///
	tot_active_partcp_cnt rtd_sep_partcp_rcvg_cnt rtd_sep_partcp_fut_cnt ///
	subtl_act_rtd_sep_cnt benef_rcvg_bnft_cnt tot_act_rtd_sep_benef_cnt ///
	partcp_account_bal_cnt sep_partcp_partl_vstd_cnt ssa_filer_partcp_cnt ///
	admin_signed_date spons_signed_date plan_name form_plan_year_begin_date ///
	form_tax_prd

	*fix EIN's wherever possible
	tostring spons_dfe_ein, force replace
	gen temp1=strlen(spons_dfe_ein)

	replace spons_dfe_ein="0"+spons_dfe_ein if temp1==8
	replace spons_dfe_ein="00"+spons_dfe_ein if temp1==7
	replace spons_dfe_ein="000"+spons_dfe_ein if temp1==6
	replace spons_dfe_ein="0000"+spons_dfe_ein if temp1==5
	replace spons_dfe_ein="00000"+spons_dfe_ein if temp1==4
	replace spons_dfe_ein="000000"+spons_dfe_ein if temp1==3
	replace spons_dfe_ein="0000000"+spons_dfe_ein if temp1==2
	replace spons_dfe_ein="00000000"+spons_dfe_ein if temp1==1
	drop temp1

	*fix PN's wherever possible
	tostring spons_dfe_pn, force replace
	gen temp1=strlen(spons_dfe_pn)

	replace spons_dfe_pn="0"+spons_dfe_pn if temp1==2
	replace spons_dfe_pn="00"+spons_dfe_pn if temp1==1
	drop temp1

	*eliminate duplicate filings by keeping the most recent record 
	gsort spons_dfe_ein spons_dfe_pn -admin_signed_date
	//filingnum==1 corresponds to the most recent filing for an EIN-PN-year
	bysort spons_dfe_ein spons_dfe_pn: gen filingnum=_n 
	keep if filingnum==1
	drop filingnum

	*Plan types 
	//Code up DC plans
	foreach j in A B C D E F G H I J K L M N O 0 P Q R{ //O sometimes enters as zero
		gen _2`j'= regexm(type_pension_bnft_code,"2`j'")
	}
	egen DC = rowmax(_2A _2B _2C _2D _2E _2F _2G _2H _2I _2J _2K _2L _2M _2N _2O _20 _2P _2Q _2R)
	drop _2*


	//Code up DB plans 
	foreach j in A B C D E G H{
		gen _1`j'= regexm(type_pension_bnft_code,"1`j'") 
	}
	egen DB = rowmax(_1A _1B _1C _1D _1E _1G _1H)
	drop _1*
	
	//Pure DC, pure DB, or mixed
	gen DC_only= (DC==1 & DB==0)
	gen DB_only= (DC==0 & DB==1)
	gen mixed_plan=1
	replace mixed_plan=0 if (DC_only==1 | DB_only==1)
	
	//cash balance=1C 
	gen cash_balance= regexm(type_pension_bnft_code,"1C")
	
	//freeze
	gen frozen_DB= regexm(type_pension_bnft_code,"1I")
	
	//termination
	gen terminated_DB= regexm(type_pension_bnft_code,"1H")

	*Participant counts
	gen plan_count=0

	*Count DC, DB participants...
	replace plan_count = tot_active_partcp_cnt if DC==1  //includes hybrid DB-DC plans
	replace plan_count = tot_active_partcp_cnt if DB==1 & DC==0 

	*Multiple plan firms
	sort spons_dfe_ein spons_dfe_pn
	bysort spons_dfe_ein: gen firm_plan_num = _n
	bysort spons_dfe_ein: egen num_plans = max(firm_plan_num)

	*tag and drop supplemental plans
	gen supplemental= (regexm(plan_name, "SUPPLEMENTAL") | regexm(plan_name,"PAST SERVICE")) & DB==1
	drop if supplemental==1

	*For a multi plan firm, the max DC enrollment across plans is assumed to be 
	*the total DC enrollment (i.e. all workers that are eligible are eligible for all plans)
	*This includes hybrid DB-DC plans
	bysort spons_dfe_ein: egen DC_enrollment = max(plan_count) if DC==1 
	replace DC_enrollment=0 if DC_enrollment==.

	*For a multi plan firm, sum of DB enrollment is assumed to 
	*reflect the total DB enrollment. (i.e. plans are created for specific workers)
	bysort spons_dfe_ein: egen DB_enrollment = sum(plan_count) if DB==1 & DC==0
	replace DB_enrollment=0 if DB_enrollment==.

	*keep key vars
	keep filing_id spons_dfe_ein spons_dfe_pn  ///
	type_plan_entity_ind type_dfe_plan_entity type_plan_filing_ind ///
	collective_bargain_ind plan_eff_date DC cash_balance terminated_DB ///
	frozen_DB DB plan_count firm_plan_num num_plans DC_enrollment DB_enrollment ///
	type_pension_bnft_code business_code DB_only DC_only mixed_plan form_plan_year_begin_date ///
	form_tax_prd
	
	replace business_code=0 if business_code==.
	tostring business_code, force replace
	
	if `y'<2006{
		tostring plan_eff_date form_plan_year_begin_date form_tax_prd, force replace
	}
	
	gen plan_start_date = date(form_plan_year_begin_date,"YMD")
	gen plan_end_date = date(form_tax_prd,"YMD")
	gen plan_start_yr = year(plan_start_date)
	gen plan_start_qtr = quarter(plan_start_date)
	gen plan_end_yr = year(plan_end_date)
	gen plan_end_qtr = quarter(plan_end_date)

	*Add admin year
	gen year=`y'

	*save
	save "$data_dir/cleaned/f_5500_`y'c.dta", replace

}

*Note:
*Starting in 2002 DB plans that are frozen are given their own code: 1I

//2002-2008
forvalues y = 2002(1)2008{

	insheet using "$data_dir/F_5500_`y'/f_5500_`y'.csv", clear

	*drop plans without EIN's 
	drop if spons_dfe_ein==0
	*drop non-pension plans
	keep if type_pension_bnft_code!=""

	*keep identity, participation, and filing information
	keep filing_id spons_dfe_ein spons_dfe_pn form_plan_year_begin_date ///
	type_plan_entity_ind type_dfe_plan_entity type_plan_filing_ind plan_eff_date ///
	collective_bargain_ind business_code tot_partcp_boy_cnt type_pension_bnft_code ///
	tot_active_partcp_cnt rtd_sep_partcp_rcvg_cnt rtd_sep_partcp_fut_cnt ///
	subtl_act_rtd_sep_cnt benef_rcvg_bnft_cnt tot_act_rtd_sep_benef_cnt ///
	partcp_account_bal_cnt sep_partcp_partl_vstd_cnt ssa_filer_partcp_cnt ///
	admin_signed_date spons_signed_date plan_name form_plan_year_begin_date ///
	form_tax_prd

	*fix EIN's wherever possible
	tostring spons_dfe_ein, force replace
	gen temp1=strlen(spons_dfe_ein)

	replace spons_dfe_ein="0"+spons_dfe_ein if temp1==8
	replace spons_dfe_ein="00"+spons_dfe_ein if temp1==7
	replace spons_dfe_ein="000"+spons_dfe_ein if temp1==6
	replace spons_dfe_ein="0000"+spons_dfe_ein if temp1==5
	replace spons_dfe_ein="00000"+spons_dfe_ein if temp1==4
	replace spons_dfe_ein="000000"+spons_dfe_ein if temp1==3
	replace spons_dfe_ein="0000000"+spons_dfe_ein if temp1==2
	replace spons_dfe_ein="00000000"+spons_dfe_ein if temp1==1
	drop temp1

	*fix PN's wherever possible
	tostring spons_dfe_pn, force replace
	gen temp1=strlen(spons_dfe_pn)

	replace spons_dfe_pn="0"+spons_dfe_pn if temp1==2
	replace spons_dfe_pn="00"+spons_dfe_pn if temp1==1
	drop temp1

	*eliminate duplicate filings by keeping the most recent record 
	gsort spons_dfe_ein spons_dfe_pn -admin_signed_date
	//filingnum==1 corresponds to the most recent filing for an EIN-PN-year
	bysort spons_dfe_ein spons_dfe_pn: gen filingnum=_n 
	keep if filingnum==1
	drop filingnum

	*Plan types 
	//Code up DC plans
	foreach j in A B C D E F G H I J K L M N O 0 P Q R{ //O sometimes enters as zero
		gen _2`j'= regexm(type_pension_bnft_code,"2`j'")
	}
	egen DC = rowmax(_2A _2B _2C _2D _2E _2F _2G _2H _2I _2J _2K _2L _2M _2N _2O _20 _2P _2Q _2R)
	drop _2*

	//Code up DB plans (frozen DB's identified with 1I)
	foreach j in A B C D E G H I{
		gen _1`j'= regexm(type_pension_bnft_code,"1`j'") 
	}
	egen DB = rowmax(_1A _1B _1C _1D _1E _1G _1H _1I)
	drop _1*
	
	//Pure DC, pure DB, or mixed
	gen DC_only= (DC==1 & DB==0)
	gen DB_only= (DC==0 & DB==1)
	gen mixed_plan=1
	replace mixed_plan=0 if (DC_only==1 | DB_only==1)
	
	//cash balance=1C 
	gen cash_balance= regexm(type_pension_bnft_code,"1C")
	
	//freeze
	gen frozen_DB= regexm(type_pension_bnft_code,"1I")
	
	//termination
	gen terminated_DB= regexm(type_pension_bnft_code,"1H")

	*Participant counts
	gen plan_count=0

	*Count DC, DB participants...
	replace plan_count = tot_active_partcp_cnt if DC==1  //includes hybrid DB-DC plans
	replace plan_count = tot_active_partcp_cnt if DB==1 & DC==0 

	*Multiple plan firms
	sort spons_dfe_ein spons_dfe_pn
	bysort spons_dfe_ein: gen firm_plan_num = _n
	bysort spons_dfe_ein: egen num_plans = max(firm_plan_num)

	*tag and drop supplemental plans
	gen supplemental= (regexm(plan_name, "SUPPLEMENTAL") | regexm(plan_name,"PAST SERVICE")) & DB==1
	drop if supplemental==1

	*For a multi plan firm, the max DC enrollment across plans is assumed to be 
	*the total DC enrollment (i.e. all workers that are eligible are eligible for all plans)
	*This includes hybrid DB-DC plans
	bysort spons_dfe_ein: egen DC_enrollment = max(plan_count) if DC==1 
	replace DC_enrollment=0 if DC_enrollment==.

	*For a multi plan firm, sum of DB enrollment is assumed to 
	*reflect the total DB enrollment. (i.e. plans are created for specific workers)
	bysort spons_dfe_ein: egen DB_enrollment = sum(plan_count) if DB==1 & DC==0
	replace DB_enrollment=0 if DB_enrollment==.

	*keep key vars
	keep filing_id spons_dfe_ein spons_dfe_pn ///
	type_plan_entity_ind type_dfe_plan_entity type_plan_filing_ind ///
	collective_bargain_ind plan_eff_date DC cash_balance terminated_DB ///
	frozen_DB DB plan_count firm_plan_num num_plans DC_enrollment DB_enrollment ///
	type_pension_bnft_code business_code DB_only DC_only mixed_plan form_plan_year_begin_date ///
	form_tax_prd
	
	if `y'<2008{
		replace business_code=0 if business_code==.
		tostring business_code, force replace
	else if `y'==2008
		replace business_code="0" if business_code==""
	}
	
	*Create string for plan start date if currently numeric
	if `y'<2006{
		tostring plan_eff_date form_plan_year_begin_date form_tax_prd, force replace
	}
	
	tostring form_plan_year_begin_date form_tax_prd, force replace
	
	gen plan_start_date = date(form_plan_year_begin_date,"YMD")
	gen plan_end_date = date(form_tax_prd,"YMD")
	gen plan_start_yr = year(plan_start_date)
	gen plan_start_qtr = quarter(plan_start_date)
	gen plan_end_yr = year(plan_end_date)
	gen plan_end_qtr = quarter(plan_end_date)


	*Add admin year
	gen year=`y'

	*save
	save "$data_dir/cleaned/f_5500_`y'c.dta", replace

}

//2009-2022
forvalues y = 2009(1)2022{

	insheet using "$data_dir/F_5500_`y'/f_5500_`y'_all.csv", clear

	*drop plans without EIN's 
	drop if spons_dfe_ein==0
	*drop non-pension plans
	keep if type_pension_bnft_code!=""

	*keep identity, participation, and filing information
	rename type_plan_entity_cd type_plan_entity_ind
	
	keep ack_id spons_dfe_ein spons_dfe_pn form_plan_year_begin_date ///
	type_plan_entity_ind type_dfe_plan_entity plan_eff_date ///
	collective_bargain_ind business_code tot_partcp_boy_cnt type_pension_bnft_code ///
	tot_active_partcp_cnt rtd_sep_partcp_rcvg_cnt rtd_sep_partcp_fut_cnt ///
	subtl_act_rtd_sep_cnt benef_rcvg_bnft_cnt tot_act_rtd_sep_benef_cnt ///
	partcp_account_bal_cnt sep_partcp_partl_vstd_cnt ///
	admin_signed_date spons_signed_date plan_name date_received form_plan_year_begin_date ///
	form_tax_prd

	*fix EIN's wherever possible
	tostring spons_dfe_ein, force replace
	gen temp1=strlen(spons_dfe_ein)

	replace spons_dfe_ein="0"+spons_dfe_ein if temp1==8
	replace spons_dfe_ein="00"+spons_dfe_ein if temp1==7
	replace spons_dfe_ein="000"+spons_dfe_ein if temp1==6
	replace spons_dfe_ein="0000"+spons_dfe_ein if temp1==5
	replace spons_dfe_ein="00000"+spons_dfe_ein if temp1==4
	replace spons_dfe_ein="000000"+spons_dfe_ein if temp1==3
	replace spons_dfe_ein="0000000"+spons_dfe_ein if temp1==2
	replace spons_dfe_ein="00000000"+spons_dfe_ein if temp1==1
	drop temp1

	*fix PN's wherever possible
	tostring spons_dfe_pn, force replace
	gen temp1=strlen(spons_dfe_pn)

	replace spons_dfe_pn="0"+spons_dfe_pn if temp1==2
	replace spons_dfe_pn="00"+spons_dfe_pn if temp1==1
	drop temp1

	*eliminate duplicate filings by keeping the most recent record 
	gsort spons_dfe_ein spons_dfe_pn -date_received
	//filingnum==1 corresponds to the most recent filing for an EIN-PN-year
	bysort spons_dfe_ein spons_dfe_pn: gen filingnum=_n 
	keep if filingnum==1
	drop filingnum

	*Plan types 
	//Code up DC plans
	foreach j in A B C D E F G H I J K L M N O 0 P Q R{ //O sometimes enters as zero
		gen _2`j'= regexm(type_pension_bnft_code,"2`j'")
	}
	egen DC = rowmax(_2A _2B _2C _2D _2E _2F _2G _2H _2I _2J _2K _2L _2M _2N _2O _20 _2P _2Q _2R)
	drop _2*

	//Code up DB plans (frozen DB's identified with 1I)
	foreach j in A B C D E G H I{
		gen _1`j'= regexm(type_pension_bnft_code,"1`j'") 
	}
	egen DB = rowmax(_1A _1B _1C _1D _1E _1G _1H _1I)
	drop _1*
	
	//Pure DC, pure DB, or mixed
	gen DC_only= (DC==1 & DB==0)
	gen DB_only= (DC==0 & DB==1)
	gen mixed_plan=1
	replace mixed_plan=0 if (DC_only==1 | DB_only==1)
	
	//cash balance=1C 
	gen cash_balance= regexm(type_pension_bnft_code,"1C")
	
	//freeze
	gen frozen_DB= regexm(type_pension_bnft_code,"1I")
	
	//termination
	gen terminated_DB= regexm(type_pension_bnft_code,"1H")

	*Participant counts
	gen plan_count=0

	*Count DC, DB participants...
	replace plan_count = tot_active_partcp_cnt if DC==1  //includes hybrid DB-DC plans
	replace plan_count = tot_active_partcp_cnt if DB==1 & DC==0 

	*Multiple plan firms
	sort spons_dfe_ein spons_dfe_pn
	bysort spons_dfe_ein: gen firm_plan_num = _n
	bysort spons_dfe_ein: egen num_plans = max(firm_plan_num)

	*tag and drop supplemental plans
	gen supplemental= (regexm(plan_name, "SUPPLEMENTAL") | regexm(plan_name,"PAST SERVICE")) & DB==1
	drop if supplemental==1

	*For a multi plan firm, the max DC enrollment across plans is assumed to be 
	*the total DC enrollment (i.e. all workers that are eligible are eligible for all plans)
	*This includes hybrid DB-DC plans
	bysort spons_dfe_ein: egen DC_enrollment = max(plan_count) if DC==1 
	replace DC_enrollment=0 if DC_enrollment==.

	*For a multi plan firm, sum of DB enrollment is assumed to 
	*reflect the total DB enrollment. (i.e. plans are created for specific workers)
	bysort spons_dfe_ein: egen DB_enrollment = sum(plan_count) if DB==1 & DC==0
	replace DB_enrollment=0 if DB_enrollment==.

	*keep key vars
	keep ack_id spons_dfe_ein spons_dfe_pn  ///
	type_plan_entity_ind type_dfe_plan_entity ///
	collective_bargain_ind plan_eff_date DC cash_balance terminated_DB ///
	frozen_DB DB plan_count firm_plan_num num_plans DC_enrollment DB_enrollment ///
	type_pension_bnft_code business_code DB_only DC_only mixed_plan form_plan_year_begin_date ///
	form_tax_prd
	
	replace business_code=0 if business_code==.
	tostring business_code, force replace
	
	tostring form_plan_year_begin_date form_tax_prd, force replace
	
	gen plan_start_date = date(form_plan_year_begin_date,"YMD")
	gen plan_end_date = date(form_tax_prd,"YMD")
	gen plan_start_yr = year(plan_start_date)
	gen plan_start_qtr = quarter(plan_start_date)
	gen plan_end_yr = year(plan_end_date)
	gen plan_end_qtr = quarter(plan_end_date)
	
	*Add admin year
	gen year=`y'

	*save
	save "$data_dir/cleaned/f_5500_`y'c.dta", replace

}

//2009-2022 SF filings
forvalues y = 2020(1)2022{ //2009-2022

	insheet using "$data_dir/F_5500_SF_`y'/f_5500_sf_`y'_all.csv", clear

	*drop plans without EIN's 
	drop if sf_spons_ein==0
	*drop non-pension plans
	keep if sf_type_pension_bnft_code!=""

	*rename variables to make them consistent with the F5500 main filing
	rename sf_spons_ein spons_dfe_ein
	rename sf_plan_num spons_dfe_pn
	rename sf_plan_year_begin_date form_plan_year_begin_date
	rename sf_tax_prd form_tax_prd
	rename sf_plan_entity_cd type_plan_entity_ind
	rename sf_plan_eff_date plan_eff_date
	rename sf_business_code business_code
	rename sf_tot_partcp_boy_cnt tot_partcp_boy_cnt
	rename sf_type_pension_bnft_code type_pension_bnft_code
	rename sf_tot_act_rtd_sep_benef_cnt tot_act_rtd_sep_benef_cnt
	rename sf_plan_name plan_name
	
	keep ack_id spons_dfe_ein spons_dfe_pn form_plan_year_begin_date form_tax_prd ///
	type_plan_entity_ind plan_eff_date business_code ///
	tot_partcp_boy_cnt type_pension_bnft_code ///
	tot_act_rtd_sep_benef_cnt date_received plan_name

	*fix EIN's wherever possible
	tostring spons_dfe_ein, force replace
	gen temp1=strlen(spons_dfe_ein)

	replace spons_dfe_ein="0"+spons_dfe_ein if temp1==8
	replace spons_dfe_ein="00"+spons_dfe_ein if temp1==7
	replace spons_dfe_ein="000"+spons_dfe_ein if temp1==6
	replace spons_dfe_ein="0000"+spons_dfe_ein if temp1==5
	replace spons_dfe_ein="00000"+spons_dfe_ein if temp1==4
	replace spons_dfe_ein="000000"+spons_dfe_ein if temp1==3
	replace spons_dfe_ein="0000000"+spons_dfe_ein if temp1==2
	replace spons_dfe_ein="00000000"+spons_dfe_ein if temp1==1
	drop temp1

	*fix PN's wherever possible
	tostring spons_dfe_pn, force replace
	gen temp1=strlen(spons_dfe_pn)

	replace spons_dfe_pn="0"+spons_dfe_pn if temp1==2
	replace spons_dfe_pn="00"+spons_dfe_pn if temp1==1
	drop temp1

	*eliminate duplicate filings by keeping the most recent record 
	gsort spons_dfe_ein spons_dfe_pn -date_received
	//filingnum==1 corresponds to the most recent filing for an EIN-PN-year
	bysort spons_dfe_ein spons_dfe_pn: gen filingnum=_n 
	keep if filingnum==1
	drop filingnum

	*Plan types 
	//Code up DC plans
	foreach j in A B C D E F G H I J K L M N O 0 P Q R{ //O sometimes enters as zero
		gen _2`j'= regexm(type_pension_bnft_code,"2`j'")
	}
	egen DC = rowmax(_2A _2B _2C _2D _2E _2F _2G _2H _2I _2J _2K _2L _2M _2N _2O _20 _2P _2Q _2R)
	drop _2*

	//Code up DB plans (frozen DB's identified with 1I)
	foreach j in A B C D E G H I{
		gen _1`j'= regexm(type_pension_bnft_code,"1`j'") 
	}
	egen DB = rowmax(_1A _1B _1C _1D _1E _1G _1H _1I)
	drop _1*
	
	//Pure DC, pure DB, or mixed
	gen DC_only= (DC==1 & DB==0)
	gen DB_only= (DC==0 & DB==1)
	gen mixed_plan=1
	replace mixed_plan=0 if (DC_only==1 | DB_only==1)
	
	//cash balance=1C 
	gen cash_balance= regexm(type_pension_bnft_code,"1C")
	
	//freeze
	gen frozen_DB= regexm(type_pension_bnft_code,"1I")
	
	//termination
	gen terminated_DB= regexm(type_pension_bnft_code,"1H")

	*Participant counts
	gen plan_count=0

	*Count DC, DB participants...
	replace plan_count = tot_partcp_boy_cnt 

	*Multiple plan firms
	sort spons_dfe_ein spons_dfe_pn
	bysort spons_dfe_ein: gen firm_plan_num = _n
	bysort spons_dfe_ein: egen num_plans = max(firm_plan_num)

	*tag and drop supplemental plans
	gen supplemental= (regexm(plan_name, "SUPPLEMENTAL") | regexm(plan_name,"PAST SERVICE")) & DB==1
	drop if supplemental==1

	*For a multi plan firm, the max DC enrollment across plans is assumed to be 
	*the total DC enrollment (i.e. all workers that are eligible are eligible for all plans)
	*This includes hybrid DB-DC plans
	bysort spons_dfe_ein: egen DC_enrollment = max(plan_count) if DC==1 
	replace DC_enrollment=0 if DC_enrollment==.

	*For a multi plan firm, sum of DB enrollment is assumed to 
	*reflect the total DB enrollment. (i.e. plans are created for specific workers)
	bysort spons_dfe_ein: egen DB_enrollment = sum(plan_count) if DB==1 & DC==0
	replace DB_enrollment=0 if DB_enrollment==.

	*keep key vars
	keep ack_id spons_dfe_ein spons_dfe_pn  ///
	type_plan_entity_ind plan_eff_date DC cash_balance terminated_DB ///
	frozen_DB DB plan_count firm_plan_num num_plans DC_enrollment DB_enrollment ///
	type_pension_bnft_code business_code DB_only DC_only mixed_plan form_plan_year_begin_date form_tax_prd
	
	replace business_code=0 if business_code==.
	tostring business_code, force replace
	

	gen plan_start_date = date(form_plan_year_begin_date,"YMD")
	gen plan_end_date = date(form_tax_prd,"YMD")
	gen plan_start_yr = year(plan_start_date)
	gen plan_start_qtr = quarter(plan_start_date)
	gen plan_end_yr = year(plan_end_date)
	gen plan_end_qtr = quarter(plan_end_date)
	
	*Add admin year
	gen year=`y'
	
	*SF plan filing flag
	gen sf_form=1

	*save
	save "$data_dir/cleaned/f_5500_sf_`y'c.dta", replace
}

*-------------------------------------------------------------------------------
*Append the files to create a panel of plan level observations
*-------------------------------------------------------------------------------

/*
forvalues y = 2009/2015{
	use "$data_dir/cleaned/f_5500_`y'c.dta", clear
	drop form_plan_year_begin_date business_code
	save "$data_dir/cleaned/f_5500_`y'c.dta", replace
}
*/

use "$data_dir/cleaned/f_5500_1999c.dta", clear

forvalues y=2000(1)2022{
	append using "$data_dir/cleaned/f_5500_`y'c.dta"
}

forvalues y=2009(1)2022{
	append using "$data_dir/cleaned/f_5500_sf_`y'c.dta"
}

sort spons_dfe_ein spons_dfe_pn year
order spons_dfe_ein spons_dfe_pn year
save "$data_dir/cleaned/plan_panel_9922c.dta", replace


*-------------------------------------------------------------------------------
*Time series of freezes & Cb conversions
*-------------------------------------------------------------------------------

use "$data_dir/cleaned/plan_panel_9922c.dta", clear

*Freezes and conversions (plan rates)
bysort year: egen num_db= sum(DB)
gen cbfrz = (frozen_DB==1 | cash_balance==1)
bysort year: egen num_cbfrz = sum(cbfrz)
gen cbfrz_rate = num_cbfrz/num_db

*Eliminate 1 outlier (error in the 2008 data: EIN 042103603)
drop if plan_count>500000 & plan_count!=. & DB==1  

*Freezes and conversions (participant rates)
gen DB_plan_count = plan_count*DB
gen DB_cbfrz_count = plan_count*cbfrz
bysort year: egen num_db_ptcp= sum(DB_plan_count) 
bysort year: egen num_cbfrz_ptcp = sum(DB_cbfrz_count) 
gen ptcp_cbfrz_rate = num_cbfrz_ptcp/num_db_ptcp

*Year of freeze
gen temp = cbfrz*year
replace temp = . if temp==0
bysort spons_dfe_ein spons_dfe_pn: egen year_cbfrz=min(temp)


*Save statistics
*Fraction of frozen/CB plans
tabstat cbfrz_rate, by(year) save
matrix cbfrz_rate = J(24,2,.)
local yr_start=1999
forvalues j=1/24{
	matrix cbfrz_rate[`j',1]=`yr_start'
	matrix cbfrz_rate[`j',2]=r(Stat`j')
	local yr_start = `yr_start'+1
}

*Export to csv 

clear
svmat cbfrz_rate
rename cbfrz_rate1 year
rename cbfrz_rate2 frac_plans_cbfrz
svmat ptcp_cbfrz_rate
drop ptcp_cbfrz_rate1
rename ptcp_cbfrz_rate2 frac_ptcp_cbfrz

export delimited using ".../data/cbfrz_rate_tseries.csv", replace



